Air Quality Module

Author: Jennifer Atkins

This is an example of plotting the correlation between two different types of air quality data from one csv file.


In [1]:
# Plotting related python libraries
import matplotlib.pyplot as plt

# Standard csv python library
import csv

# Main python library for mathematical calculations
import numpy as np

# Python library for calculating the p value in correlation
from scipy.stats.stats import pearsonr

In [2]:
def correlation_coefficient(xdata,ydata):
    #Calculates the correlation coefficient between the two data types
    # 'x' and 'y' data respresent the two data sets being compared
	xmean = np.mean(xdata) #Calculating the average in each data set
	ymean = np.mean(ydata)
	xsigma = np.sqrt(np.var(xdata)) #Calculating the standard deviation in each data
	ysigma = np.sqrt(np.var(ydata))
	xysums = 0
	for i in range(len(xdata)):
		xdiff = xdata[i] - xmean
		ydiff = ydata[i] - ymean
		xysums = xdiff * ydiff +xysums
	stnddevs = xsigma * ysigma
	coeff = xysums/stnddevs/len(xdata)
	return coeff

In [ ]:
#Opens csv file chosen by the user
user_file = input("File Name: ")
results = csv.reader(open(user_file), delimiter=',')

In [ ]:
#Tells user that each number (1-9) represents a collumn in the data file, the collumn numbers correspond with one type of data
print("KEY: 1 = 0.3um, 2 = 0.5um, 3 = 1.0um, 4 = 2.5um, 5 = 5.0um, 6 = 10um, 7 = PM 1.0, 8 = PM 2.5, 9 = PM 10")

#User indicates which data types to compare
data_a = int(input("Particulate Size 1 (Enter a number between 1 and 9): "))
data_b = int(input("Particulate Size 2 (Enter a different number between 1 and 9): "))

In [ ]:
labelA = []
labelB = []

#Depending on which data type chosen, the types' names are appended to the x/y-axis labels
if data_a == 1:
	labelA.append("0.3 um")
elif data_a == 2:
	labelA.append("0.5 um")
elif data_a == 3:
	labelA.append("1.0 um")
elif data_a == 4:
	labelA.append("2.5 um")
elif data_a == 5:
	labelA.append("5.0 um")
elif data_a == 6:
	labelA.append("10 um")
elif data_a == 7:
	labelA.append("PM 1.0")
elif data_a == 8:
	labelA.append("PM 2.5")
elif data_a == 9:
	labelA.append("PM 10")

if data_b == 1:
	labelB.append("0.3 um")
elif data_b == 2:
	labelB.append("0.5 um")
elif data_b == 3:
	labelB.append("1.0 um")
elif data_b == 4:
	labelB.append("2.5 um")
elif data_b == 5:
	labelB.append("5.0 um")
elif data_b == 6:
	labelB.append("10 um")
elif data_b == 7:
	labelB.append("PM 1.0")
elif data_b == 8:
	labelB.append("PM 2.5")
elif data_b == 9:
	labelB.append("PM 10")

#Converts list to string
Alabel = ''.join(labelA)
Blabel = ''.join(labelB)

In [ ]:
ValA = []
ValB = []

row_counter= 0
for r in results:
    #Skip first row (Row that specifies fields)
    row_counter += 1
    if row_counter>1:
        #Append each column in CSV to a separate list
        ValA.append(int(r[data_a])) #data_a/b stands for the collumn number chosen by user
        ValB.append(int(r[data_b]))

In [ ]:
#Choose the number of data points to combine and average
n_merge = int(input("n data points to combine:"))
ndata_a = len(ValA)
ndata_b = len(ValB)
nsum_data_a= int(ndata_a/n_merge)
nsum_data_b= int(ndata_b/n_merge)

In [ ]:
data_ave_a = []
data_ave_b = []
data_unc_a = []
data_unc_b = []

for i in range(nsum_data_a):
    #Calculate the mean and standard deviation
	idata = ValA[i*n_merge:(i+1)*n_merge]
	idata_array = np.asarray(idata) #Convert 'idata' to a numpy array
	aqmeana = np.mean(idata_array)
	aqsigmaA = np.sqrt(np.var(idata_array))
	data_ave_a.append(aqmeana)
	data_unc_a.append(aqsigmaA)

for i in range(nsum_data_b):
	idata = ValB[i*n_merge:(i+1)*n_merge]
	idata_array = np.asarray(idata)
	aqmeanb = np.mean(idata_array)
	aqsigmaB = np.sqrt(np.var(idata_array))
	data_ave_b.append(aqmeanb)
	data_unc_b.append(aqsigmaB)

In [ ]:
#Calculate the correlation coefficient and p value of the two data types
correlation_values = pearsonr(data_ave_a, data_ave_b)
p_value = ("p Value =", correlation_values[1])

corr_coeff = correlation_coefficient(np.asarray(data_ave_a),np.asarray(data_ave_b))
corr_statemnt = "Correlation coefficient = ", corr_coeff

In [ ]:
#Plot the correlation
plt.figure(1)
plt.plot(data_ave_a, data_ave_b, "b.")
plt.xlabel(Alabel)
plt.ylabel(Blabel)
file_title = "Air Quality Correlation Results"
plt.annotate(corr_statemnt, xy=(0, 1), xytext=(12, -12), va='top', #Print correlation coefficient in top left corner of graph
	xycoords='axes fraction', textcoords='offset points')
plt.annotate(p_value, xy=(0, .94), xytext=(12, -12), va='top', #Print p value underneath the correlation coeff.
	xycoords='axes fraction', textcoords='offset points')
plt.title(file_title)

plt.show()